# -*- coding: utf-8 -*-
"""
Created on Tue Jul  9 17:04:25 2024

# Code to calculate wage-bill weighted exposure share of industries
# Code adapted from Eloundou et al. (2024) replication code

@author: aharding6
"""

# Import packages
from os import chdir
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Set working directory
Path = "G:/My Drive/Research/AI and Energy/Drafting/ERL/Replication/"
Path = Path+"Data/Input/GPTs-are-GPTs-main/"
chdir(Path)




##########################################################################
# Step 1: Import and prep exposure scores (Task-level)
##########################################################################


def read_tsv(path):
    """
    Load TSV using pandas
    """
    return pd.read_csv(path, delimiter="\t")

# Load exposure scores
bls_oews = pd.read_csv("data/national_May2021_dl.csv")
full_labels = read_tsv("data/full_labelset.tsv")
full_labels.rename(
    columns={
        "gpt4_exposure_alt_rubric": "gpt4_alt_exposure",
        "human_exposure_agg": "human_exposure",
    },
    inplace=True,
)
df_tasks = read_tsv("data/full_onet_data.tsv")


###########
# Translate Categorical Data
###########

alpha_score_map = {"E0": 0.0, "E1": 1.0, "E2": 0.0, "E3": 0.0}
beta_score_map = {"E0": 0.0, "E1": 1.0, "E2": 0.5, "E3": 0.5}
gamma_score_map = {"E0": 0.0, "E1": 1.0, "E2": 1.0, "E3": 1.0}
t_score_map = {"T0": 0.0, "T1": 0.25, "T2": 0.5, "T3": 0.75, "T4": 1.0}

for prefix in ["gpt4", "human"]:
    full_labels[f"{prefix}_alpha"] = full_labels[f"{prefix}_exposure"].apply(
        lambda x: alpha_score_map[x]
    )
    full_labels[f"{prefix}_beta"] = full_labels[f"{prefix}_exposure"].apply(
        lambda x: beta_score_map[x]
    )
    full_labels[f"{prefix}_gamma"] = full_labels[f"{prefix}_exposure"].apply(
        lambda x: gamma_score_map[x]
    )

full_labels["gpt4_alt_beta"] = full_labels["gpt4_alt_exposure"].apply(
    lambda x: beta_score_map[x]
)

full_labels["automation"] = full_labels["gpt4_automation"].apply(
    lambda x: t_score_map[x]
)


## Create binary for exposure a la Acemoglu 2024 - ARH
for label in ["gpt4_beta","human_beta","automation"]:
    # Create binary where >0.5 is coded as 1 and <=0.5 is coded as 0
    full_labels[label+'_binary'] = 0
    full_labels.loc[full_labels[label]>=0.75,label+'_binary'] = 1
    # Automation is more granular (more codes) so can create range of binaries
    if label=="automation":
        # Create upper bound where anything >0 is coded as 1
        full_labels[label+'_binary_upper'] = 0
        full_labels.loc[full_labels[label]>=0.25,label+'_binary_upper'] = 1
        # Create lower bound where anything ==1 is coded as 1
        full_labels[label+'_binary_lower'] = 0
        full_labels.loc[full_labels[label]==1,label+'_binary_lower'] = 1





##########################################################################
# Step 2: Aggregate task-level exposure scores to occupation level (6-digit SOC)
##########################################################################


###########
# Add in weights for aggregation
###########

df_taskratings = read_tsv(
    "https://www.onetcenter.org/dl_files/database/db_27_2_text/Task%20Ratings.txt"
)

df_taskratings_relevant = df_taskratings[df_taskratings["Scale ID"].isin(["RT", "IM"])][
    ["O*NET-SOC Code", "Task ID", "Scale ID", "Data Value"]
]
dfr_taskratings = pd.pivot(
    df_taskratings_relevant, index=["O*NET-SOC Code", "Task ID"], columns=["Scale ID"]
).reset_index()
dfr_taskratings.columns = ["O*NET-SOC Code", "Task ID", "importance", "relevance"]

# converting the core / supplemental rating to a numerical weight where core is worth 2x the taskweight of a supplemental task.
df_tasks["coreweight"] = df_tasks["Task Type"].map(
    {"Core": 2, "Supplemental": 1, np.nan: 1}
)
df_tasks["equalweight"] = df_tasks["Task Type"].map(
    {"Core": 1, "Supplemental": 1, np.nan: 1}
)
task_tmp = pd.merge(df_tasks, dfr_taskratings, how="left")


impWeightOcc = (
    task_tmp[["O*NET-SOC Code", "importance", "relevance", "coreweight", "equalweight"]]
    .groupby("O*NET-SOC Code")
    .sum()
    .reset_index()
)
impWeightOcc.rename(
    columns={
        "importance": "impTotal",
        "relevance": "relTotal",
        "coreweight": "coreweightTotal",
        "equalweight": "equalweightTotal",
    },
    inplace=True,
)
task_tmp_weighted = pd.merge(task_tmp, impWeightOcc, how="left", on="O*NET-SOC Code")
task_tmp_weighted["importance_weight"] = (
    task_tmp_weighted["importance"] / task_tmp_weighted["impTotal"]
)
task_tmp_weighted["relevance_weight"] = (
    task_tmp_weighted["relevance"] / task_tmp_weighted["relTotal"]
)
task_tmp_weighted["core_weight"] = (
    task_tmp_weighted["coreweight"] / task_tmp_weighted["coreweightTotal"]
)
task_tmp_weighted["equal_weight"] = (
    task_tmp_weighted["equalweight"] / task_tmp_weighted["equalweightTotal"]
)



## merge weights to the label frame
full_labels_weight = pd.merge(
    full_labels,
    task_tmp_weighted[
        [
            "O*NET-SOC Code",
            "Task ID",
            "importance_weight",
            "relevance_weight",
            "core_weight",
            "equal_weight",
        ]
    ],
    how="left",
)

## scoring and labeling
full_labels_weight["gpt_relevant"] = full_labels_weight["gpt_3_relevant"].astype(float)





################
# Aggregate to occupations (8-digit SOC)
################

def weighted_mean(df, groupfields, aggfields, weightfield):
    df2 = df[aggfields].multiply(df[weightfield], axis="index")
    aa = df[[weightfield] + groupfields]
    df3 = df2.join(aa)
    dfg = df3[groupfields + aggfields].groupby(groupfields).sum().reset_index()
    return dfg

## Must be one of {'equal_weight','core_weight','relevance_weight','importance_weight'}
weight_field = "equal_weight"

## How we are grouping tasks (by occupation)
group_fields = ["O*NET-SOC Code", "Title"]

## These are the fields that will be aggregated to the occupation-level and used throughout
rating_fields = [
    "gpt4_beta",
    "human_beta",
    "automation",
    "gpt4_beta_binary",
    "human_beta_binary",
    "automation_binary",
    "automation_binary_upper",
    "automation_binary_lower",
]

## Run the weighting
occ_level = weighted_mean(full_labels_weight, group_fields, rating_fields, weight_field)


################
# Aggregate to occupations (6-digit SOC)
################

# Aggregate from 8-digit SOC to 6-digit SOC taking simple average of exposure across occupations
occ_level["OCC_CODE"] = occ_level["O*NET-SOC Code"].str.slice(start=0, stop=7)

scores_fields_and_id = [
    "OCC_CODE",
] + rating_fields
occ_lvl6 = (
    occ_level[[item for item in occ_level if item in scores_fields_and_id]]
    .groupby("OCC_CODE")
    .mean()
    .reset_index()
)





##########################################################################
# Step 3: Create wage-bill weighted exposure score for occupation (5-digit SOC) by industry (NAICS)
##########################################################################

##########
# Prep industry data
##########

## need to first compile industry datasets
# Function takes (partial) filename and years as inputs and exports wage bill weights at NAICSxSOC 5-digit and 6-digit level
def proc_ind_file(file_in,yrs):
    # Read in gdp deflator to convert to 2017USD
    gdp_def = pd.read_csv("../A191RD3A086NBEA.csv")
    gdp_def['Year'] = gdp_def['DATE'].str.slice(start=2,stop=4)
    gdp_def['A191RD3A086NBEA'] = gdp_def['A191RD3A086NBEA']/100 # Convert to fraction for scaling
    # Get wage-bill weights for each occupation, averaging over 2019-2022 from BLS OEWS
    # Loop over years and import data
    for i in yrs:
        raw_bls = pd.read_csv("data/bls_oes/oesm"+str(i)+file_in+str(i)+"_dl.csv")
        # character replacements. For # there needs to be a replacement at the upper end of the range per BLS.
        raw_bls.replace(to_replace=["*", "**"], value=np.nan, inplace=True)
        raw_bls.replace(
            to_replace=["#"], value=208000, inplace=True
        )  # bigger than $208k/year gets a #.
        raw_bls.replace(to_replace=",", value="", regex=True, inplace=True)
        
        # commas and non-numeric fields
        numeric_fields = [
            "TOT_EMP",
            "A_MEAN"
        ] 
        for field in numeric_fields:
            raw_bls[field] = pd.to_numeric(raw_bls[field])
        
        # Create annual wage bill using employment and annual average wage deflated using GDP deflator
        raw_bls['wage'+str(i)] = raw_bls['TOT_EMP']*raw_bls['A_MEAN']/gdp_def.loc[gdp_def.Year==str(i),"A191RD3A086NBEA"].values[0]
        # Get total wages (if aggregate than just single value. If by NAICS have to sum across codes) deflated using GDP deflator
        if file_in=="nat/national_M20":
            totwage = (raw_bls.loc[raw_bls['OCC_CODE']=="00-0000",'TOT_EMP']*raw_bls.loc[raw_bls['OCC_CODE']=="00-0000",'A_MEAN']).values[0]/gdp_def.loc[gdp_def.Year==str(i),"A191RD3A086NBEA"].values[0]
        else:
            totwage = (raw_bls.loc[raw_bls['OCC_CODE']=="00-0000",'TOT_EMP']*raw_bls.loc[raw_bls['OCC_CODE']=="00-0000",'A_MEAN']).sum()/gdp_def.loc[gdp_def.Year==str(i),"A191RD3A086NBEA"].values[0]
            
        raw_bls['totwage'+str(i)] = totwage
        raw_bls['wagebill_weight'+str(i)] = raw_bls['wage'+str(i)]/raw_bls['totwage'+str(i)]
        raw_bls['laborsharegdp_weight'+str(i)] = raw_bls['wage'+str(i)]/23e12
        # Add NAICS columns if for aggregate economy
        if file_in=="nat/national_M20":
            raw_bls['NAICS'] = "00"
        # print("# SOC 6-digit in BLS OEWS data = "+str(len(raw_bls.loc[raw_bls['O_GROUP']=="detailed","OCC_CODE"].unique())))
        raw_bls_detailed = raw_bls.loc[raw_bls['O_GROUP']=="detailed",]
        raw_bls_broad = raw_bls.loc[raw_bls['O_GROUP']=="broad",]
        
        raw_bls_detailed['NAICS_OCC_CODE'] = raw_bls_detailed['NAICS'].astype(str)+raw_bls_detailed['OCC_CODE']
        raw_bls_broad['NAICS_OCC_CODE'] = raw_bls_broad['NAICS'].astype(str)+raw_bls_broad['OCC_CODE']
        
        # Merge years
        raw_bls_detailed_tmp = raw_bls_detailed[["NAICS_OCC_CODE",'wage'+str(i),'totwage'+str(i),'wagebill_weight'+str(i),'laborsharegdp_weight'+str(i)]].copy()
        raw_bls_broad_tmp = raw_bls_broad[["NAICS_OCC_CODE",'wage'+str(i),'totwage'+str(i),'wagebill_weight'+str(i),'laborsharegdp_weight'+str(i)]].copy()
        
        if i==yrs[0]:
            raw_bls_detailed_final = raw_bls_detailed_tmp.copy()
            raw_bls_broad_final = raw_bls_broad_tmp.copy()
        else:
            raw_bls_detailed_final = pd.merge(raw_bls_detailed_final,raw_bls_detailed_tmp,on=["NAICS_OCC_CODE"],how="outer")
            raw_bls_broad_final = pd.merge(raw_bls_broad_final,raw_bls_broad_tmp,on=["NAICS_OCC_CODE"],how="outer")
        
    # Calculate average wagebill weight by occupation across years 
    # First calculate by averaging wage bill and total wage bill across years and dividing those averages
    # Second calculate by averaging wage bill weight across years
    raw_bls_detailed_final['wage'] = raw_bls_detailed_final[['wage'+str(yr) for yr in yrs]].mean(axis=1,skipna=True)
    raw_bls_detailed_final['wagebill_weight'] = raw_bls_detailed_final[['wagebill_weight'+str(yr) for yr in yrs]].mean(axis=1,skipna=True)
    raw_bls_detailed_final['laborsharegdp_weight'] = raw_bls_detailed_final[['laborsharegdp_weight'+str(yr) for yr in yrs]].mean(axis=1,skipna=True)
    raw_bls_broad_final['wage'] = raw_bls_broad_final[['wage'+str(yr) for yr in yrs]].mean(axis=1,skipna=True)
    raw_bls_broad_final['wagebill_weight'] = raw_bls_broad_final[['wagebill_weight'+str(yr) for yr in yrs]].mean(axis=1,skipna=True)
    raw_bls_broad_final['laborsharegdp_weight'] = raw_bls_broad_final[['laborsharegdp_weight'+str(yr) for yr in yrs]].mean(axis=1,skipna=True)
    
    # Unbundle codes and merge back NAICS titles
    code_lens = len(raw_bls_detailed_final['NAICS_OCC_CODE'].iloc[0])
    raw_bls_detailed_final['NAICS'] = raw_bls_detailed_final['NAICS_OCC_CODE'].str.slice(start =0 ,stop=code_lens-7)
    raw_bls_detailed_final['OCC_CODE'] = raw_bls_detailed_final['NAICS_OCC_CODE'].str.slice(start = code_lens-7,stop=code_lens)
    
    code_lens = len(raw_bls_detailed_final['NAICS_OCC_CODE'].iloc[0])
    raw_bls_broad_final['NAICS'] = raw_bls_broad_final['NAICS_OCC_CODE'].str.slice(start = 0,stop=code_lens-7)
    raw_bls_broad_final['OCC_CODE'] = raw_bls_broad_final['NAICS_OCC_CODE'].str.slice(start = code_lens-7,stop=code_lens)
    
    
    return raw_bls_broad_final,raw_bls_detailed_final


# Aggregate economy 
ind_df_agg, ind_df_det_agg = proc_ind_file("nat/national_M20",yrs=range(19,23))
    
# 2-digit(-ish) NAICS
ind_df_2, ind_df_det_2 = proc_ind_file("in4/natsector_M20",yrs=range(19,23))
    
# 3-digit NAICS
ind_df_3, ind_df_det_3 = proc_ind_file("in4/nat3d_M20",yrs=range(19,23))
    
# 4-digit NAICS
ind_df_4, ind_df_det_4 = proc_ind_file("in4/nat4d_M20",yrs=range(19,23))





##########
# Weight exposure scores by wage-bill weights from industry data
##########


## need to merge occupation exposure scores with wage weights
# Function takes industry wage weights at 6-digit and 5-digit SOC level and exposure scoress at 5-digit SOC level as inputs and exports merged data at NAICSxSOC 5-digit level
def exp_weight(ind_df,ind_df_det,occ_lvl):  
    # Merge occupational exposure scores with wage weights at 6-digit SOC
    occ_lvl = pd.merge(
        ind_df_det,
        occ_lvl,
        how="left",
        on="OCC_CODE",
    )     
    
    # Calculate wage-bill weighted exposure at 6-digit SOC
    for rating in rating_fields:
        occ_lvl['expshare_'+rating+'_wagebill_weight'] = occ_lvl[rating]*occ_lvl['wagebill_weight']
        occ_lvl['expshare_'+rating+'_laborsharegdp_weight'] = occ_lvl[rating]*occ_lvl['laborsharegdp_weight']
        # print("Aggregate wage-bill weighted exposure for "+rating+" = "+str(occ_lvl['expshare_'+rating].sum()))
    
    # Aggregate up to 5-digit SOC, summing wage-bill weighted exposure when data exists at 6-digit SOC and taking simple average of exposure to use with 5-digit wage data when not
    occ_lvl["OCC_CODE"] = occ_lvl["OCC_CODE"].str.slice(start=0, stop=6)+"0"
    
    sum_fields = ['NAICS','OCC_CODE','wagebill_weight','laborsharegdp_weight']+['expshare_'+rating+'_wagebill_weight' for rating in rating_fields]+['expshare_'+rating+'_laborsharegdp_weight' for rating in rating_fields]
    occ_lvlsum = (
        occ_lvl[[item for item in occ_lvl if item in sum_fields]]
        .groupby(['NAICS',"OCC_CODE"])
        .sum(min_count=1)
        .reset_index()
    )
    mean_fields = ['NAICS',"NAICS_TITLE",'OCC_CODE']+rating_fields
    occ_lvlmean = (
        occ_lvl[[item for item in occ_lvl if item in mean_fields]]
        .groupby(['NAICS',"OCC_CODE"])
        .mean()
        .reset_index()
    )
    occ_lvl = pd.merge(occ_lvlmean,occ_lvlsum,on=['NAICS',"OCC_CODE"])
    
    # Fill in wagebill weight for occupations missing data at 6-digit soc using wage data at 5-digit soc
    for i in occ_lvl.loc[occ_lvl['wagebill_weight'].isnull(),].index:
        nai = occ_lvl['NAICS'].iloc[i]
        occ = occ_lvl['OCC_CODE'].iloc[i]
        if occ in list(ind_df['OCC_CODE']) and nai in list(ind_df.loc[ind_df['OCC_CODE']==occ,'NAICS']):
            occ_lvl["wagebill_weight"].iloc[i] = ind_df.loc[((ind_df['NAICS']==nai)&(ind_df['OCC_CODE']==occ)),'wagebill_weight'].values[0]
            occ_lvl[["expshare_"+rating+'_wagebill_weight' for rating in rating_fields]].iloc[i] = occ_lvl[[rating for rating in rating_fields]].iloc[i]*occ_lvl["wagebill_weight"].iloc[i]
    
    # Fill in wagebill weight for occupations missing data at 6-digit soc using wage data at 5-digit soc
    for i in occ_lvl.loc[occ_lvl['laborsharegdp_weight'].isnull(),].index:
        nai = occ_lvl['NAICS'].iloc[i]
        occ = occ_lvl['OCC_CODE'].iloc[i]
        if occ in list(ind_df['OCC_CODE']) and nai in list(ind_df.loc[ind_df['OCC_CODE']==occ,'NAICS']):
            occ_lvl["laborsharegdp_weight"].iloc[i] = ind_df.loc[((ind_df['NAICS']==nai)&(ind_df['OCC_CODE']==occ)),'laborsharegdp_weight'].values[0]
            occ_lvl[["expshare_"+rating+'_laborsharegdp_weight' for rating in rating_fields]].iloc[i] = occ_lvl[[rating for rating in rating_fields]].iloc[i]*occ_lvl["laborsharegdp_weight"].iloc[i]
    
    return occ_lvl



ind_occ_df_agg_weight = exp_weight(ind_df_agg, ind_df_det_agg, occ_lvl6)

ind_occ_df_2_weight = exp_weight(ind_df_2, ind_df_det_2, occ_lvl6)

ind_occ_df_3_weight = exp_weight(ind_df_3, ind_df_det_3, occ_lvl6)

ind_occ_df_4_weight = exp_weight(ind_df_4, ind_df_det_4, occ_lvl6)




# Aggregate to Industry level (Drops unweighted exposure scores)
def ind_sum(ind_occ_df):
    sum_fields = ['NAICS','wagebill_weight']+['expshare_'+rating+'_wagebill_weight' for rating in rating_fields]+['expshare_'+rating+'_laborsharegdp_weight' for rating in rating_fields]
    ind_df = (
        ind_occ_df[[item for item in ind_occ_df if item in sum_fields]]
        .groupby(['NAICS'])
        .sum(min_count=1)
        .reset_index()
    )
    return ind_df



ind_df_agg_weight = ind_sum(ind_occ_df_agg_weight)

ind_df_2_weight = ind_sum(ind_occ_df_2_weight)

ind_df_3_weight = ind_sum(ind_occ_df_3_weight)

ind_df_4_weight = ind_sum(ind_occ_df_4_weight)

# Export data
ind_df_agg_weight.to_csv("../../Output/Occupation_wagebillweighted_exposure.csv",index=False)

ind_df_2_weight.to_csv("../../Output/NAICS2_Occupation_wagebillweighted_exposure.csv",index=False)

ind_df_3_weight.to_csv("../../Output/NAICS3_Occupation_wagebillweighted_exposure.csv",index=False)

ind_df_4_weight.to_csv("../../Output/NAICS4_Occupation_wagebillweighted_exposure.csv",index=False)






